# Load the Frito-Lay case-study data directly from the course repository
fritolay <- read.csv("https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%2014%20and%2015%20Case%20Study%202/CaseStudy2-data.csv")
# List every column to get an overview of the available features
names(fritolay)
## [1] "ID" "Age"
## [3] "Attrition" "BusinessTravel"
## [5] "DailyRate" "Department"
## [7] "DistanceFromHome" "Education"
## [9] "EducationField" "EmployeeCount"
## [11] "EmployeeNumber" "EnvironmentSatisfaction"
## [13] "Gender" "HourlyRate"
## [15] "JobInvolvement" "JobLevel"
## [17] "JobRole" "JobSatisfaction"
## [19] "MaritalStatus" "MonthlyIncome"
## [21] "MonthlyRate" "NumCompaniesWorked"
## [23] "Over18" "OverTime"
## [25] "PercentSalaryHike" "PerformanceRating"
## [27] "RelationshipSatisfaction" "StandardHours"
## [29] "StockOptionLevel" "TotalWorkingYears"
## [31] "TrainingTimesLastYear" "WorkLifeBalance"
## [33] "YearsAtCompany" "YearsInCurrentRole"
## [35] "YearsSinceLastPromotion" "YearsWithCurrManager"
# Count missing values per column. colSums(is.na()) is fully vectorized and,
# unlike sapply(), always returns a named numeric vector regardless of input.
missing <- colSums(is.na(fritolay)) # no missing values
# Split columns by type. vapply() with the is.character predicate is safer than
# comparing class(): class() may return a vector of length > 1 for some objects,
# and sapply() can silently change its return shape.
is_char <- vapply(fritolay, is.character, logical(1))
# data with descriptive (categorical) values
char_data <- fritolay[, is_char]   # 9 character columns
# data with continuous (numeric) values
cont_data <- fritolay[, !is_char]  # 27 numeric columns
# Number of distinct values in each descriptive (character) column
unique_col <- char_data %>% summarize_all(n_distinct)
# pivot_longer() supersedes the deprecated gather(); result is sorted by the
# number of distinct values (prints as a tibble rather than a data.frame).
unique_col <- unique_col %>%
  pivot_longer(Attrition:OverTime,
               names_to = "Columns_names", values_to = "Unique_Values") %>%
  arrange(Unique_Values)
unique_col
## Columns_names Unique_Values
## 1 Over18 1
## 2 Attrition 2
## 3 Gender 2
## 4 OverTime 2
## 5 BusinessTravel 3
## 6 Department 3
## 7 MaritalStatus 3
## 8 EducationField 6
## 9 JobRole 9
# Only JobRole and EducationField have more than 3 unique values
# Unique values for every character variable. lapply() is type-stable: it always
# returns a list, whereas sapply() would silently simplify if lengths matched.
unique_chars <- lapply(char_data, unique) # no empty strings or anything odd
# Over18 has a single value, so it carries no information -- drop it
fritolay <- fritolay %>% select(-Over18)
Percent distribution
Features vs. attrition percent
#———————————————————————
Insights for categorical variables
Almost 50% of the rows have a stock-option level of zero, which tells us that many people either opt out of, or are not offered, the stock-option plan.
Given the number of zeros in years since last promotion, many people appear to have been promoted within the last year.
Given the zeros in years with current manager and years in current role, we can see that there are quite a few new workers.
Let's look at the distribution and summary statistics of everything.
#summary(cont_data)
# Are there duplicate rows?
# Fix: the original `fritolay[duplicated(fritolay)]` indexes COLUMNS, not rows
# (hence the recorded output "data frame with 0 columns and 870 rows"). Row-wise
# duplicate selection needs the trailing comma.
fritolay[duplicated(fritolay), ] # no duplicate rows (0 rows returned)
# Drop identifier and constant columns that add no modeling value
fritolay <- fritolay %>%
  select(-ID, -EmployeeNumber, -EmployeeCount, -NumCompaniesWorked, -StandardHours)
## Exploratory looks at employees with zero-valued tenure fields
#fritolay %>% filter(NumCompaniesWorked==0)
#fritolay %>% filter(StockOptionLevel==0)
#fritolay %>% filter(TotalWorkingYears==0) #----->>> They seem to be new workers: 18-19 years old, single, relationship satisfaction 3-4, and this is their first company
#fritolay %>% filter(TrainingTimesLastYear==0)
#fritolay %>% filter(YearsAtCompany==0)
#fritolay %>% filter(YearsInCurrentRole==0)
# YearsSinceLastPromotion, YearsWithCurrManager and YearsInCurrentRole are all 0 for these workers
Normal trends that should appear in every company are present here: Monthly Income vs. Job Level; Performance Rating vs. Percent Salary Hike; Monthly Income vs. Total Working Years; Total Working Years vs. Age; Relationship Satisfaction vs. Years With Current Manager; Performance Rating vs. Training; Job Role vs. Work-Life Balance; Performance Rating vs. Relationship with Manager; and Job Level plus Monthly Income vs. Total Working Years (correlation about .78).
library(car)             # vif()
library(caret)           # train(), trainControl(), confusionMatrix() used below
library(lmtest)
library(splitstackshape)
library("car")           # duplicate load kept from original; library() is idempotent
# Reproducible 75/25 train/test split for the salary models
set.seed(20)
TrainObs <- sample(seq(1, dim(fritolay)[1]), round(.75 * dim(fritolay)[1]), replace = FALSE)
salaryTrain <- fritolay[TrainObs, ]
salaryTest <- fritolay[-TrainObs, ]
RMSE_store <- c() # accumulate every model's RMSE for comparison
# Model 1: regress MonthlyIncome on everything except Department and Attrition
Model1_ <- lm(MonthlyIncome ~ . - Department - Attrition, data = salaryTrain)
#vif(Model1_) # check collinearity
# The variance inflation factor is high for JobRole: it is strongly collinear
# with another independent variable, so it has to be removed from the model.
# Let's look at the score first, though.
### Important variables: YearsSinceLastPromotion, TotalWorkingYears, PerformanceRating, MonthlyRate, JobRoleLaboratory Technician, JobRoleManager, JobRoleResearch Scientist, JobRoleResearch Director, JobLevel, BusinessTravelTravel_Rarely
# Refit without JobRole to remove the high collinearity
Model1_ <- lm(MonthlyIncome ~ . - Department - JobRole - Attrition, data = salaryTrain)
#vif(Model1_) # collinearity looks fine now, so we can continue
#summary(Model1_) # R-squared about .90 -- not bad
Model1_Preds <- predict(Model1_, newdata = salaryTest) # hold-out predictions
MSPE <- mean((salaryTest$MonthlyIncome - Model1_Preds)^2)
# Helper: root mean squared error of a residual vector
RMSE <- function(error) { sqrt(mean(error^2)) }
RMSE(Model1_$residuals) # training RMSE
## [1] 1338.657
#1338.657
RMSE_store <- c(RMSE_store, RMSE(Model1_$residuals)) # keep for comparison
# Model 2: only the strongest predictors from model 1 (by p-value)
Model2_ <- lm(MonthlyIncome ~ TotalWorkingYears + JobLevel + DistanceFromHome + YearsWithCurrManager,
              data = salaryTrain)
#vif(Model2_) # no collinearity among these predictors
#summary(Model2_) # R-squared about .92
Model2_Preds <- predict(Model2_, newdata = salaryTest) # hold-out predictions
MSPE_2 <- mean((salaryTest$MonthlyIncome - Model2_Preds)^2)
RMSE <- function(error) { sqrt(mean(error^2)) } # RMSE helper (redefined as in the report)
RMSE(Model2_$residuals) # training RMSE
## [1] 1382.347
#1351
# Model 2.5: same predictors as model 2, scored with leave-one-out CV (caret)
loocv_ctrl <- trainControl(method = "LOOCV")
train(MonthlyIncome ~ TotalWorkingYears + JobLevel + DistanceFromHome + YearsWithCurrManager,
      method = "lm", data = fritolay, trControl = loocv_ctrl)
## Linear Regression
##
## 870 samples
## 4 predictor
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 869, 869, 869, 869, 869, 869, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 1377.913 0.9100788 1038.864
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
#RMSE 1377
# Model 3: add quadratic terms to the model-2 predictors
Model3_ <- lm(MonthlyIncome ~ TotalWorkingYears + I(TotalWorkingYears^2) +
                JobLevel + I(JobLevel^2) + DistanceFromHome +
                YearsWithCurrManager + I(YearsWithCurrManager^2),
              data = salaryTrain)
#vif(Model3_) # collinearity increased, so the linear-model assumptions suffer
#summary(Model3_) # R-squared about .92
Model3_Preds <- predict(Model3_, newdata = salaryTest)
MSPE_3 <- mean((salaryTest$MonthlyIncome - Model3_Preds)^2)
RMSE <- function(error) { sqrt(mean(error^2)) }
RMSE(Model3_$residuals)
## [1] 1264.69
#1259 <-- better RMSE, but the correlation between terms is too high
#plot(Model3_)
RMSE_store <- c(RMSE_store, RMSE(Model3_$residuals)) # save RMSE
# Model 4: quadratic terms (now including DistanceFromHome^2), LOOCV-scored
train(MonthlyIncome ~ TotalWorkingYears + I(TotalWorkingYears^2) +
        JobLevel + I(JobLevel^2) +
        DistanceFromHome + I(DistanceFromHome^2) +
        YearsWithCurrManager + I(YearsWithCurrManager^2),
      method = "lm", data = fritolay, trControl = trainControl(method = "LOOCV"))
## Linear Regression
##
## 870 samples
## 4 predictor
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 869, 869, 869, 869, 869, 869, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 1286.825 0.9215757 942.0208
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
#RMSE 1286
## Most significant attrition predictors from the linear screen (p-values):
## OverTimeYes             < 2e-16  ***
## JobInvolvement          1.20e-07 ***
## JobSatisfaction         4.79e-06 ***
## MonthlyIncome           0.000154 ***
## MaritalStatusSingle     0.000412 ***
## YearsSinceLastPromotion 0.004467 **
Graph of linear model results
# Plot actual vs. predicted MonthlyIncome for Model 2 on the hold-out set
new <- salaryTest
new$Predictions <- Model2_Preds
# Fix: map the column of the piped data (MonthlyIncome) rather than reaching
# into salaryTest$MonthlyIncome inside aes() -- mixing an external vector into
# aes() breaks silently if the plotted data is ever filtered or reordered.
new %>% ggplot(aes(MonthlyIncome, Predictions)) + geom_point() + geom_smooth() +
  ggtitle("Actual vs predictions") + xlab("Actual") + ylab("Predicted")
RMSE_store <- c(RMSE_store, RMSE(Model2_$residuals)) # store value
# Predict with our best low-collinearity model (Model 2) on the unlabeled set
NoSalary <- read.csv("https://raw.githubusercontent.com/alanabadzic/CaseStudy2DDS/main/CaseStudy2CompSet%20No%20Salary.csv")
Model2_Preds <- predict(Model2_, newdata = NoSalary)
NoSalary$MonthlyIncome <- Model2_Preds
NoSalary <- NoSalary %>% select(ID, MonthlyIncome)
write.csv(NoSalary, file = "Case2PredictionsAbadzicAhumadaSalary.csv", row.names = FALSE) # results CSV
# Encode Attrition as a 0/1 indicator so it can be screened with regression
fritolay$AttritionBin <- ifelse(fritolay$Attrition == "Yes", 1, 0)
# Linear regression on the indicator to surface the most important variables
Model1_ <- lm(AttritionBin ~ . - Attrition, data = fritolay)
#summary(Model1_)
# Build a smaller model from the significant terms and check it
t2 <- lm(AttritionBin ~ OverTime + JobInvolvement + JobSatisfaction +
           MonthlyIncome + MaritalStatus + YearsSinceLastPromotion,
         data = fritolay)
#summary(t2)
#vif(t2) # not too much correlation -- great! Start with these variables
## Explore the density of MonthlyIncome to decide how to standardize it
#ggplot(fritolay, aes(MonthlyIncome)) + geom_density(fill="blue")
#ggplot(fritolay, aes(log(MonthlyIncome))) + geom_density(fill="blue")
#ggplot(fritolay, aes(sqrt(MonthlyIncome))) + geom_density(fill="blue")
# log() gives the most symmetric distribution, so standardize with it
fritolay$logMonthlyIncome <- log(fritolay$MonthlyIncome)
# Test the accuracy of a Naive Bayes attrition model
library(e1071)
splitPerc <- .70 # 70/30 train/test split
trainIndices <- sample(seq_len(dim(fritolay)[1]), round(splitPerc * dim(fritolay)[1]))
train <- fritolay[trainIndices, ]
test <- fritolay[-trainIndices, ]
# Test 1: all features as predictors.
# Fix: the original call naiveBayes(train, train$Attrition) passed the ENTIRE
# frame -- including Attrition itself and AttritionBin -- as predictors, so the
# label leaked into the features and the recorded confusion matrix reported a
# perfect (and meaningless) accuracy of 1. Drop the label columns first.
model <- naiveBayes(train %>% select(-Attrition, -AttritionBin), train$Attrition)
prediction <- table(predict(model, test), test$Attrition)
CM <- confusionMatrix(prediction, positive = "Yes")
CM
# NOTE(review): the output below was produced by the leaky model; rerun the
# chunk to get honest numbers.
## Confusion Matrix and Statistics
##
##
## No Yes
## No 220 0
## Yes 0 41
##
## Accuracy : 1
## 95% CI : (0.986, 1)
## No Information Rate : 0.8429
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.1571
## Detection Rate : 0.1571
## Detection Prevalence : 0.1571
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : Yes
##
# Test 2: hand-picked predictor set.
# Fix: fit on the TRAINING split only. The original fit on all of `fritolay`,
# so the hold-out rows scored below were also in the training data, inflating
# the reported accuracy.
model2 <- naiveBayes(Attrition ~ OverTime + JobInvolvement + JobSatisfaction +
                       MonthlyIncome + MaritalStatus + YearsSinceLastPromotion +
                       logMonthlyIncome + EnvironmentSatisfaction + DistanceFromHome +
                       WorkLifeBalance + JobRole + RelationshipSatisfaction,
                     data = train)
prediction2 <- table(predict(model2, test), test$Attrition)
CM <- confusionMatrix(prediction2, positive = "Yes")
CM
# NOTE(review): the output below came from the model fit on the full data set;
# rerun to get an honest hold-out estimate.
## Confusion Matrix and Statistics
##
##
## No Yes
## No 217 13
## Yes 3 28
##
## Accuracy : 0.9387
## 95% CI : (0.9024, 0.9646)
## No Information Rate : 0.8429
## P-Value [Acc > NIR] : 1.887e-06
##
## Kappa : 0.743
##
## Mcnemar's Test P-Value : 0.02445
##
## Sensitivity : 0.6829
## Specificity : 0.9864
## Pos Pred Value : 0.9032
## Neg Pred Value : 0.9435
## Prevalence : 0.1571
## Detection Rate : 0.1073
## Detection Prevalence : 0.1188
## Balanced Accuracy : 0.8346
##
## 'Positive' Class : Yes
##
# Predict attrition for the unlabeled competition set
NoAttrition <- read.csv("https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%2014%20and%2015%20Case%20Study%202/CaseStudy2CompSet%20No%20Attrition.csv")
# Fix: model2 uses logMonthlyIncome as a predictor, but the competition file
# only ships MonthlyIncome; without this derived column predict.naiveBayes
# silently drops that feature when scoring. Derive it first.
NoAttrition$logMonthlyIncome <- log(NoAttrition$MonthlyIncome)
Attrition_Preds <- predict(model2, NoAttrition)
NoAttrition$Attrition <- Attrition_Preds
NoAttrition <- NoAttrition %>% select(ID, Attrition)
# Write the predictions out as a CSV
write.csv(NoAttrition, file = "Case2PredictionsAbadzicAhumadaAttrition.csv", row.names = FALSE)
#read.csv("NoAttritionResult.csv")
# Verifying the top 3 factors for attrition
#CM = confusionMatrix(prediction, positive = "Yes")
#m1 <- glm(AttritionBin~JobSatisfaction+OverTime+JobInvolvement, #family=binomial, data=fritolay)
#summary(m1)
library(corrplot)
# NOTE(review): everything below is commented-out scaffolding (predict_classes /
# author-factor tables) apparently pasted from another project; none of it runs
# and the names it references (nt_frame, test_labels) do not exist in this file.
#pred <- Model1_ %>%predict_classes(salaryTest) %>% factor(0:8)
#help(predict_class)
#res_tab <- table(Pred = pred, Act = test_labels)
#res_prop <- prop.table(res_tab,2)
#author_key <- tibble(author = nt_frame$author, code = nt_frame$author_factor) %>%
# unique %>%
# arrange(code)
#colnames(res_prop) <- author_key$author
#rownames(res_prop) <- author_key$author
#corrplot(res_prop,is.corr = FALSE,
# method = "circle", addCoef.col = "lightblue", number.cex = 0.7)
# Which job roles exist within each department?
fritolay %>% filter(Department == "Sales") %>% select(JobRole) %>% unique()
## JobRole
## 1 Sales Executive
## 4 Sales Representative
## 9 Manager
fritolay %>% filter(Department == "Research & Development") %>% select(JobRole) %>% unique()
## JobRole
## 1 Research Director
## 2 Manufacturing Director
## 3 Research Scientist
## 6 Healthcare Representative
## 7 Manager
## 14 Laboratory Technician
fritolay %>% filter(Department == "Human Resources") %>% select(JobRole) %>% unique()
## JobRole
## 1 Human Resources
## 3 Manager
# "Manager" appears in several departments, so disambiguate it by department.
# Fix: assign into the JobRole column directly. The original indexed the whole
# data frame with a matrix condition (fritolay == "Manager" & ...), which
# relies on fragile recycling and would clobber ANY column containing "Manager".
fritolay$JobRole[fritolay$JobRole == "Manager" & fritolay$Department == "Sales"] <- "Sales Manager"
fritolay$JobRole[fritolay$JobRole == "Manager" & fritolay$Department == "Human Resources"] <- "HR Manager"
library(viridis)
library("ggsci")
# Discrete-color bar chart: education field broken down by department
fritolay %>% ggplot(aes(y = EducationField, fill = Department)) + geom_bar() +
  ylab("Education Field") +
  ggtitle("Educational field vs Department")
# Helper: share of a department within a field, formatted as "NN.N%"
dept_share <- function(k, n) paste0(round((k / n) * 100, 1), "%")
# Per education field: how its people are distributed across departments
fritolay %>%
  group_by(EducationField) %>%
  summarize(Total_people = n(),
            Sales = dept_share(sum(Department == "Sales"), n()),
            Research = dept_share(sum(Department == "Research & Development"), n()),
            HR = dept_share(sum(Department == "Human Resources"), n()))
## # A tibble: 6 × 5
## EducationField Total_people Sales Research HR
## <chr> <int> <chr> <chr> <chr>
## 1 Human Resources 15 0% 0% 100%
## 2 Life Sciences 358 26.5% 70.9% 2.5%
## 3 Marketing 100 100% 0% 0%
## 4 Medical 270 17.4% 79.6% 3%
## 5 Other 52 23.1% 75% 1.9%
## 6 Technical Degree 75 25.3% 72% 2.7%
All people that studied Marketing are in Sales, just like all people that studied Human Resources are in Human Resources. However, those educated in Other, Technical, Science, or Medicine also work in other fields.
17% of those that studied Medicine ended up going into Sales.
2.7% of those with a Technical degree ended up in Human Resources, which is surprising.
Rest of the graphs to understand the data
#new_day==cont_data %>% select("DailyRate")
#does not equal DailyRate!
# Age distribution, colored by job satisfaction
ggplot(fritolay, aes(x = Age, fill = as.factor(JobSatisfaction))) + geom_bar()
# Years since last promotion, by satisfaction level
ggplot(fritolay, aes(y = YearsSinceLastPromotion, color = as.factor(JobSatisfaction))) + geom_boxplot()
ggplot(fritolay, aes(x = YearsSinceLastPromotion, fill = as.factor(JobSatisfaction))) + geom_bar()
# Rank correlation between satisfaction and time since last promotion
cor.test(x = fritolay$JobSatisfaction, y = fritolay$YearsSinceLastPromotion, method = "spearman")
##
## Spearman's rank correlation rho
##
## data: fritolay$JobSatisfaction and fritolay$YearsSinceLastPromotion
## S = 110395779, p-value = 0.8625
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.005880834
# no correlation between job satisfaction and years since last promotion
ggplot(fritolay, aes(x = YearsSinceLastPromotion, y = JobSatisfaction,
                     color = as.factor(JobSatisfaction))) + geom_col()
# Environment satisfaction vs. job satisfaction
ggplot(fritolay, aes(x = EnvironmentSatisfaction, y = JobSatisfaction,
                     color = as.factor(JobSatisfaction))) + geom_col()
cor.test(x = fritolay$EnvironmentSatisfaction, y = fritolay$JobSatisfaction, method = "spearman")
##
## Spearman's rank correlation rho
##
## data: fritolay$EnvironmentSatisfaction and fritolay$JobSatisfaction
## S = 111511310, p-value = 0.6365
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.01604509
# no correlation
ggplot(fritolay, aes(x = EnvironmentSatisfaction, fill = as.factor(JobSatisfaction))) + geom_bar()
ggplot(fritolay, aes(y = EnvironmentSatisfaction, fill = as.factor(JobSatisfaction))) + geom_boxplot()
ggplot(fritolay, aes(y = JobRole, fill = as.factor(JobSatisfaction))) + geom_bar()
ggplot(fritolay, aes(y = WorkLifeBalance, fill = as.factor(Attrition))) + geom_bar()
# no correlation apparent here either
# Percentage of each job-satisfaction score (1-4) within every job role
l <- fritolay %>%
  group_by(JobRole) %>%
  summarize(one   = round(((sum(JobSatisfaction == 1) / n()) * 100), 1),
            two   = round(((sum(JobSatisfaction == 2) / n()) * 100), 1),
            three = round(((sum(JobSatisfaction == 3) / n()) * 100), 1),
            four  = round(((sum(JobSatisfaction == 4) / n()) * 100), 1))
# Faceted bar chart: within each JobSatisfaction level, the proportion of
# responses contributed by each job role, labeled with the percentage.
# NOTE(review): `..prop..` / `..y..` are the superseded dot-dot stat notation
# (ggplot2 >= 3.4 prefers after_stat()); str_wrap() requires stringr to be
# loaded -- confirm it is attached earlier in the report.
ggplot(fritolay, aes(y= as.factor(JobRole), group=JobSatisfaction)) +
geom_bar(aes(x = ..prop.., fill = factor(..y..)), stat="count") +
geom_text(size=2,aes( label = scales::percent(..prop..),
x= ..prop.. ), stat= "count", vjust = -.3) +
labs(y = "Percent") +
facet_grid(~JobSatisfaction)+ scale_x_continuous(labels = scales::percent) +
scale_y_discrete(labels = function(x) str_wrap(x, width = 3))
# there seems to be no relationship between environment and job satisfaction
# Monthly rate vs. job satisfaction
ggplot(fritolay, aes(y = MonthlyRate, color = as.factor(JobSatisfaction))) + geom_boxplot()
# satisfaction levels 3 and 4 show a higher mean monthly rate than 1 and 2
# Percentage of each job-satisfaction score within every environment-satisfaction level
f <- fritolay %>%
  group_by(EnvironmentSatisfaction) %>%
  summarize(one   = round(((sum(JobSatisfaction == 1) / n()) * 100), 1),
            two   = round(((sum(JobSatisfaction == 2) / n()) * 100), 1),
            three = round(((sum(JobSatisfaction == 3) / n()) * 100), 1),
            four  = round(((sum(JobSatisfaction == 4) / n()) * 100), 1))